1   package org.apache.lucene.util.automaton;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.util.ArrayList;
21  import java.util.HashMap;
22  import java.util.HashSet;
23  import java.util.LinkedList;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.Random;
27  import java.util.Set;
28  
29  import org.apache.lucene.util.ArrayUtil;
30  import org.apache.lucene.util.IntsRef;
31  import org.apache.lucene.util.IntsRefBuilder;
32  import org.apache.lucene.util.TestUtil;
33  import org.apache.lucene.util.UnicodeUtil;
34  
35  /**
36   * Utilities for testing automata.
37   * <p>
38   * Capable of generating random regular expressions,
39   * and automata, and also provides a number of very
40   * basic unoptimized implementations (*slow) for testing.
41   */
42  public class AutomatonTestUtil {
43    /**
44     * Default maximum number of states that {@link Operations#determinize} should create.
45     */
46    public static final int DEFAULT_MAX_DETERMINIZED_STATES = 1000000;
47  
48    /** Returns random string, including full unicode range. */
49    public static String randomRegexp(Random r) {
50      while (true) {
51        String regexp = randomRegexpString(r);
52        // we will also generate some undefined unicode queries
53        if (!UnicodeUtil.validUTF16String(regexp))
54          continue;
55        try {
56          new RegExp(regexp, RegExp.NONE);
57          return regexp;
58        } catch (Exception e) {}
59      }
60    }
61  
62    private static String randomRegexpString(Random r) {
63      final int end = r.nextInt(20);
64      if (end == 0) {
65        // allow 0 length
66        return "";
67      }
68      final char[] buffer = new char[end];
69      for (int i = 0; i < end; i++) {
70        int t = r.nextInt(15);
71        if (0 == t && i < end - 1) {
72          // Make a surrogate pair
73          // High surrogate
74          buffer[i++] = (char) TestUtil.nextInt(r, 0xd800, 0xdbff);
75          // Low surrogate
76          buffer[i] = (char) TestUtil.nextInt(r, 0xdc00, 0xdfff);
77        }
78        else if (t <= 1) buffer[i] = (char) r.nextInt(0x80);
79        else if (2 == t) buffer[i] = (char) TestUtil.nextInt(r, 0x80, 0x800);
80        else if (3 == t) buffer[i] = (char) TestUtil.nextInt(r, 0x800, 0xd7ff);
81        else if (4 == t) buffer[i] = (char) TestUtil.nextInt(r, 0xe000, 0xffff);
82        else if (5 == t) buffer[i] = '.';
83        else if (6 == t) buffer[i] = '?';
84        else if (7 == t) buffer[i] = '*';
85        else if (8 == t) buffer[i] = '+';
86        else if (9 == t) buffer[i] = '(';
87        else if (10 == t) buffer[i] = ')';
88        else if (11 == t) buffer[i] = '-';
89        else if (12 == t) buffer[i] = '[';
90        else if (13 == t) buffer[i] = ']';
91        else if (14 == t) buffer[i] = '|';
92      }
93      return new String(buffer, 0, end);
94    }
95    
96    /** picks a random int code point, avoiding surrogates;
97     * throws IllegalArgumentException if this transition only
98     * accepts surrogates */
99    private static int getRandomCodePoint(final Random r, int min, int max) {
100     final int code;
101     if (max < UnicodeUtil.UNI_SUR_HIGH_START ||
102         min > UnicodeUtil.UNI_SUR_HIGH_END) {
103       // easy: entire range is before or after surrogates
104       code = min+r.nextInt(max-min+1);
105     } else if (min >= UnicodeUtil.UNI_SUR_HIGH_START) {
106       if (max > UnicodeUtil.UNI_SUR_LOW_END) {
107         // after surrogates
108         code = 1+UnicodeUtil.UNI_SUR_LOW_END+r.nextInt(max-UnicodeUtil.UNI_SUR_LOW_END);
109       } else {
110         throw new IllegalArgumentException("transition accepts only surrogates: min=" + min + " max=" + max);
111       }
112     } else if (max <= UnicodeUtil.UNI_SUR_LOW_END) {
113       if (min < UnicodeUtil.UNI_SUR_HIGH_START) {
114         // before surrogates
115         code = min + r.nextInt(UnicodeUtil.UNI_SUR_HIGH_START - min);
116       } else {
117         throw new IllegalArgumentException("transition accepts only surrogates: min=" + min + " max=" + max);
118       }
119     } else {
120       // range includes all surrogates
121       int gap1 = UnicodeUtil.UNI_SUR_HIGH_START - min;
122       int gap2 = max - UnicodeUtil.UNI_SUR_LOW_END;
123       int c = r.nextInt(gap1+gap2);
124       if (c < gap1) {
125         code = min + c;
126       } else {
127         code = UnicodeUtil.UNI_SUR_LOW_END + c - gap1 + 1;
128       }
129     }
130 
131     assert code >= min && code <= max && (code < UnicodeUtil.UNI_SUR_HIGH_START || code > UnicodeUtil.UNI_SUR_LOW_END):
132       "code=" + code + " min=" + min + " max=" + max;
133     return code;
134   }
135 
136   /**
137    * Lets you retrieve random strings accepted
138    * by an Automaton.
139    * <p>
140    * Once created, call {@link #getRandomAcceptedString(Random)}
141    * to get a new string (in UTF-32 codepoints).
142    */
143   public static class RandomAcceptedStrings {
144 
145     private final Map<Transition,Boolean> leadsToAccept;
146     private final Automaton a;
147     private final Transition[][] transitions;
148 
149     private static class ArrivingTransition {
150       final int from;
151       final Transition t;
152 
153       public ArrivingTransition(int from, Transition t) {
154         this.from = from;
155         this.t = t;
156       }
157     }
158 
159     public RandomAcceptedStrings(Automaton a) {
160       this.a = a;
161       if (a.getNumStates() == 0) {
162         throw new IllegalArgumentException("this automaton accepts nothing");
163       }
164       this.transitions = a.getSortedTransitions();
165 
166       leadsToAccept = new HashMap<>();
167       final Map<Integer,List<ArrivingTransition>> allArriving = new HashMap<>();
168 
169       final LinkedList<Integer> q = new LinkedList<>();
170       final Set<Integer> seen = new HashSet<>();
171 
172       // reverse map the transitions, so we can quickly look
173       // up all arriving transitions to a given state
174       int numStates = a.getNumStates();
175       for(int s=0;s<numStates;s++) {
176         for(Transition t : transitions[s]) {
177           List<ArrivingTransition> tl = allArriving.get(t.dest);
178           if (tl == null) {
179             tl = new ArrayList<>();
180             allArriving.put(t.dest, tl);
181           }
182           tl.add(new ArrivingTransition(s, t));
183         }
184         if (a.isAccept(s)) {
185           q.add(s);
186           seen.add(s);
187         }
188       }
189 
190       // Breadth-first search, from accept states,
191       // backwards:
192       while (q.isEmpty() == false) {
193         final int s = q.removeFirst();
194         List<ArrivingTransition> arriving = allArriving.get(s);
195         if (arriving != null) {
196           for(ArrivingTransition at : arriving) {
197             final int from = at.from;
198             if (!seen.contains(from)) {
199               q.add(from);
200               seen.add(from);
201               leadsToAccept.put(at.t, Boolean.TRUE);
202             }
203           }
204         }
205       }
206     }
207 
208     public int[] getRandomAcceptedString(Random r) {
209 
210       final List<Integer> soFar = new ArrayList<>();
211 
212       int s = 0;
213 
214       while(true) {
215       
216         if (a.isAccept(s)) {
217           if (a.getNumTransitions(s) == 0) {
218             // stop now
219             break;
220           } else {
221             if (r.nextBoolean()) {
222               break;
223             }
224           }
225         }
226 
227         if (a.getNumTransitions(s) == 0) {
228           throw new RuntimeException("this automaton has dead states");
229         }
230 
231         boolean cheat = r.nextBoolean();
232 
233         final Transition t;
234         if (cheat) {
235           // pick a transition that we know is the fastest
236           // path to an accept state
237           List<Transition> toAccept = new ArrayList<>();
238           for(Transition t0 : transitions[s]) {
239             if (leadsToAccept.containsKey(t0)) {
240               toAccept.add(t0);
241             }
242           }
243           if (toAccept.size() == 0) {
244             // this is OK -- it means we jumped into a cycle
245             t = transitions[s][r.nextInt(transitions[s].length)];
246           } else {
247             t = toAccept.get(r.nextInt(toAccept.size()));
248           }
249         } else {
250           t = transitions[s][r.nextInt(transitions[s].length)];
251         }
252         soFar.add(getRandomCodePoint(r, t.min, t.max));
253         s = t.dest;
254       }
255 
256       return ArrayUtil.toIntArray(soFar);
257     }
258   }
259 
260   private static Automaton randomSingleAutomaton(Random random) {
261     while (true) {
262       try {
263         Automaton a1 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton();
264         if (random.nextBoolean()) {
265           a1 = Operations.complement(a1, DEFAULT_MAX_DETERMINIZED_STATES);
266         }
267         return a1;
268       } catch (TooComplexToDeterminizeException tctde) {
269         // This can (rarely) happen if the random regexp is too hard; just try again...
270       }
271     }
272   }
273   
274   /** return a random NFA/DFA for testing */
275   public static Automaton randomAutomaton(Random random) {
276     // get two random Automata from regexps
277     Automaton a1 = randomSingleAutomaton(random);
278     Automaton a2 = randomSingleAutomaton(random);
279 
280     // combine them in random ways
281     switch (random.nextInt(4)) {
282       case 0: return Operations.concatenate(a1, a2);
283       case 1: return Operations.union(a1, a2);
284       case 2: return Operations.intersection(a1, a2);
285       default: return Operations.minus(a1, a2, DEFAULT_MAX_DETERMINIZED_STATES);
286     }
287   }
288   
289   /** 
290    * below are original, unoptimized implementations of DFA operations for testing.
291    * These are from brics automaton, full license (BSD) below:
292    */
293   
294   /*
295    * dk.brics.automaton
296    * 
297    * Copyright (c) 2001-2009 Anders Moeller
298    * All rights reserved.
299    * 
300    * Redistribution and use in source and binary forms, with or without
301    * modification, are permitted provided that the following conditions
302    * are met:
303    * 1. Redistributions of source code must retain the above copyright
304    *    notice, this list of conditions and the following disclaimer.
305    * 2. Redistributions in binary form must reproduce the above copyright
306    *    notice, this list of conditions and the following disclaimer in the
307    *    documentation and/or other materials provided with the distribution.
308    * 3. The name of the author may not be used to endorse or promote products
309    *    derived from this software without specific prior written permission.
310    * 
311    * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
312    * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
313    * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
314    * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
315    * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
316    * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
317    * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
318    * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
319    * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
320    * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
321    */
322 
323   /**
324    * Simple, original brics implementation of Brzozowski minimize()
325    */
326   public static Automaton minimizeSimple(Automaton a) {
327     Set<Integer> initialSet = new HashSet<Integer>();
328     a = determinizeSimple(Operations.reverse(a, initialSet), initialSet);
329     initialSet.clear();
330     a = determinizeSimple(Operations.reverse(a, initialSet), initialSet);
331     return a;
332   }
333   
334   /**
335    * Simple, original brics implementation of determinize()
336    */
337   public static Automaton determinizeSimple(Automaton a) {
338     Set<Integer> initialset = new HashSet<>();
339     initialset.add(0);
340     return determinizeSimple(a, initialset);
341   }
342 
343   /** 
344    * Simple, original brics implementation of determinize()
345    * Determinizes the given automaton using the given set of initial states. 
346    */
347   public static Automaton determinizeSimple(Automaton a, Set<Integer> initialset) {
348     if (a.getNumStates() == 0) {
349       return a;
350     }
351     int[] points = a.getStartPoints();
352     // subset construction
353     Map<Set<Integer>, Set<Integer>> sets = new HashMap<>();
354     LinkedList<Set<Integer>> worklist = new LinkedList<>();
355     Map<Set<Integer>, Integer> newstate = new HashMap<>();
356     sets.put(initialset, initialset);
357     worklist.add(initialset);
358     Automaton.Builder result = new Automaton.Builder();
359     result.createState();
360     newstate.put(initialset, 0);
361     Transition t = new Transition();
362     while (worklist.size() > 0) {
363       Set<Integer> s = worklist.removeFirst();
364       int r = newstate.get(s);
365       for (int q : s) {
366         if (a.isAccept(q)) {
367           result.setAccept(r, true);
368           break;
369         }
370       }
371       for (int n = 0; n < points.length; n++) {
372         Set<Integer> p = new HashSet<>();
373         for (int q : s) {
374           int count = a.initTransition(q, t);
375           for(int i=0;i<count;i++) {
376             a.getNextTransition(t);
377             if (t.min <= points[n] && points[n] <= t.max) {
378               p.add(t.dest);
379             }
380           }
381         }
382 
383         if (!sets.containsKey(p)) {
384           sets.put(p, p);
385           worklist.add(p);
386           newstate.put(p, result.createState());
387         }
388         int q = newstate.get(p);
389         int min = points[n];
390         int max;
391         if (n + 1 < points.length) {
392           max = points[n + 1] - 1;
393         } else {
394           max = Character.MAX_CODE_POINT;
395         }
396         result.addTransition(r, q, min, max);
397       }
398     }
399 
400     return Operations.removeDeadStates(result.finish());
401   }
402 
403   /**
404    * Simple, original implementation of getFiniteStrings.
405    *
406    * <p>Returns the set of accepted strings, assuming that at most
407    * <code>limit</code> strings are accepted. If more than <code>limit</code> 
408    * strings are accepted, the first limit strings found are returned. If <code>limit</code>&lt;0, then 
409    * the limit is infinite.
410    *
411    * <p>This implementation is recursive: it uses one stack
412    * frame for each digit in the returned strings (ie, max
413    * is the max length returned string).
414    */
415   public static Set<IntsRef> getFiniteStringsRecursive(Automaton a, int limit) {
416     HashSet<IntsRef> strings = new HashSet<>();
417     if (!getFiniteStrings(a, 0, new HashSet<Integer>(), strings, new IntsRefBuilder(), limit)) {
418       return strings;
419     }
420     return strings;
421   }
422 
423   /**
424    * Returns the strings that can be produced from the given state, or
425    * false if more than <code>limit</code> strings are found. 
426    * <code>limit</code>&lt;0 means "infinite".
427    */
428   private static boolean getFiniteStrings(Automaton a, int s, HashSet<Integer> pathstates, 
429       HashSet<IntsRef> strings, IntsRefBuilder path, int limit) {
430     pathstates.add(s);
431     Transition t = new Transition();
432     int count = a.initTransition(s, t);
433     for (int i=0;i<count;i++) {
434       a.getNextTransition(t);
435       if (pathstates.contains(t.dest)) {
436         return false;
437       }
438       for (int n = t.min; n <= t.max; n++) {
439         path.append(n);
440         if (a.isAccept(t.dest)) {
441           strings.add(path.toIntsRef());
442           if (limit >= 0 && strings.size() > limit) {
443             return false;
444           }
445         }
446         if (!getFiniteStrings(a, t.dest, pathstates, strings, path, limit)) {
447           return false;
448         }
449         path.setLength(path.length() - 1);
450       }
451     }
452     pathstates.remove(s);
453     return true;
454   }
455 
456   /**
457    * Returns true if the language of this automaton is finite.
458    * <p>
459    * WARNING: this method is slow, it will blow up if the automaton is large.
460    * this is only used to test the correctness of our faster implementation.
461    */
462   public static boolean isFiniteSlow(Automaton a) {
463     if (a.getNumStates() == 0) {
464       return true;
465     }
466     return isFiniteSlow(a, 0, new HashSet<Integer>());
467   }
468   
469   /**
470    * Checks whether there is a loop containing s. (This is sufficient since
471    * there are never transitions to dead states.)
472    */
473   // TODO: not great that this is recursive... in theory a
474   // large automata could exceed java's stack
475   private static boolean isFiniteSlow(Automaton a, int s, HashSet<Integer> path) {
476     path.add(s);
477     Transition t = new Transition();
478     int count = a.initTransition(s, t);
479     for (int i=0;i<count;i++) {
480       a.getNextTransition(t);
481       if (path.contains(t.dest) || !isFiniteSlow(a, t.dest, path)) {
482         return false;
483       }
484     }
485     path.remove(s);
486     return true;
487   }
488   
489   /**
490    * Checks that an automaton has no detached states that are unreachable
491    * from the initial state.
492    */
493   public static void assertNoDetachedStates(Automaton a) {
494     Automaton a2 = Operations.removeDeadStates(a);
495     assert a.getNumStates() == a2.getNumStates() : "automaton has " + (a.getNumStates() - a2.getNumStates()) + " detached states";
496   }
497 
498   /** Returns true if the automaton is deterministic. */
499   public static boolean isDeterministicSlow(Automaton a) {
500     Transition t = new Transition();
501     int numStates = a.getNumStates();
502     for(int s=0;s<numStates;s++) {
503       int count = a.initTransition(s, t);
504       int lastMax = -1;
505       for(int i=0;i<count;i++) {
506         a.getNextTransition(t);
507         if (t.min <= lastMax) {
508           assert a.isDeterministic() == false;
509           return false;
510         }
511         lastMax = t.max;
512       }
513     }
514 
515     assert a.isDeterministic() == true;
516     return true;
517   }
518   
519 }